home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Power Programmierung
/
Power-Programmierung CD 2 (Tewi)(1994).iso
/
doc
/
mir
/
a_bytes.c
< prev
next >
Wrap
Text File
|
1992-07-02
|
12KB
|
367 lines
/*
* usage: a_bytes [ /L ] file_name[s]
*
* A_BYTES Analyze the bytes (characters) used within any file, report
* the frequency of each byte present. If the location flag /L
* is set, include offsets of the first 8 occurrences of each
* byte pattern present.
*
* input: Any file[s] whatsoever.
*
* output: file_name.BYT which contains up to 256 lines, one line for
* each different byte present. The byte is shown first in
* printable OR octal form, then the hexadecimal equivalent.
* The third column is frequency. The fourth column shows
* percentage of total occurrences within the file.
*
* If the /L locations option is selected, the output file is
* named file_name.LOC and the offsets of up to the first 8
* occurrences follow at the end of each line.
*
* writeup: MIR TUTORIAL ONE, topic 5
* Compiled with STACK = 16000
*
* written: Douglas Lowry Jan 04 92
* modified: Douglas Lowry Feb 15 92
* Mar 20 92 Ten alternative report names
* Copyright (C) 1992 Marpex Inc.
*
* The MIR (Mass Indexing and Retrieval) Tutorials explain detailed
* usage and co-ordination of the MIR family of programs to analyze,
* prepare and index databases (small through gigabyte size), and
* how to build integrated retrieval software around the MIR search
* engine. The fifth of the five MIR tutorial series explains how
* to extend indexing capability into leading edge search-related
* technologies. For more information, GO IBMPRO on CompuServe;
* MIR files are in the DBMS library. The same files are on the
* Canada Remote Systems BBS. A diskette copy of the Introduction
* is available by mail ($10 US... check, Visa or Mastercard);
* diskettes with Introduction, Tutorial ONE software and the
* shareware Tutorial ONE text cost $29. Shareware registration
* for a tutorial is also $29.
*
* E-mail...
* Compuserve 71431,1337
* Internet doug.lowry%canrem.com
* UUCP canrem!doug.lowry
* Others: doug.lowry@canrem.uucp
*
* FAX... 416 963-5677
*
* "Snail mail"... Douglas Lowry, Ph.D.
* Marpex Inc.
* 5334 Yonge Street, #1102
* North York, Ontario
* Canada M2N 6M2
*
* Related database consultation and preparation services are
* available through:
* Innotech Inc., 2001 Sheppard Avenue E., Suite #118,
* North York, Ontario Canada M2J 4Z7
* Tel. 416 492-3838 FAX 416 492-3843
*
* This program is free software; you may redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* (file 05LICENS) along with this program; if not, write to the
* Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
* USA.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys\types.h>
#include <sys\stat.h>
#include <io.h>
/* "repeat" opens an endless loop; the loop body must leave it itself
   (break or return). */
#define repeat for(;;)
/* Two-valued flag type used for the yes/no variables in this file. */
typedef enum bool
{ FALSE = 0, TRUE = 1 } Bool ;
#define INTAKE 2048 /* # of bytes in input buffer */
/* Old-style (unprototyped) forward declarations of functions defined
   further down in this file; argument types are not checked at calls. */
void process(), Usage_(), report(), non_exist() ;
/*
 * CMDNAME_ - Supply the program name that Usage_() inserts into the
 *            usage message.
 */
char *
Cmdname_()
{
    char *name = "a_bytes" ;

    return name ;
}
/*
 * MAIN - Parse the optional locations switch (/L or -L, either case),
 *        then for every file named on the command line count its
 *        bytes with process() and write the results with report().
 *
 *        The usage text is shown (and the program ends with status 1)
 *        when no argument is given, when the first argument starts
 *        with '-', '/' or '?' but is not the locations switch, or when
 *        the switch is given without any file name.  A file that
 *        cannot be opened ends the run with status 1.
 */
int
main( argc, argv )
    int     argc;
    char    **argv;
{
    Bool        locations ;         /* requested by user */
    char        c10;                /* argv[1][0] */
    int         fd,                 /* file descriptor */
                file, bgn_at ;
    long int    cum[ 256 ],         /* accumulator for each byte */
                locn[ 256 ][ 8 ];   /* offset of 8 occurrences of each */

    /* argv[1] is a null pointer when there are no arguments, so argc
       has to be tested before argv[1] is looked at. */
    if( argc < 2 )
        Usage_();                   /* does not return */

    c10 = argv[1][0] ;
    locations = FALSE ;
    if( c10 == '-' || c10 == '/' || c10 == '?' )
    {
        /* c10 is not NUL here, so argv[1][1] lies inside the string.
           The argument itself is left unmodified: only a switch is
           compared without regard to case, file names are untouched. */
        if( c10 != '?' && ( argv[1][1] == 'L' || argv[1][1] == 'l' ) )
            locations = TRUE ;
        else
            Usage_();
    }

    bgn_at = locations ? 2 : 1 ;
    if( bgn_at >= argc )
        Usage_();                   /* switch given, but nothing to analyze */

    for( file = bgn_at ; file < argc ; file++ )
    {
        if(( fd = open( argv[ file ], O_RDONLY | O_BINARY )) == -1 )
        {
            fprintf( stderr, "Can't open file %s\n", argv[ file ] );
            exit( 1 );
        }
        process( fd, cum, locn, locations );
        report( cum, locn, locations, argv[ file ] ) ;
        if( close( fd ))
            fprintf( stderr, "Problem closing %s\n", argv[ file ] );
    }
    return 0 ;
}
/*
 * USAGE_ - Write the usage summary to stderr and end the program with
 *          exit status 1.  Control never returns to the caller.
 */
void
Usage_()
{
/* The help text is emitted by several fprintf calls.  Each long literal
   is continued with backslash-newline, so everything on a continuation
   line, leading blanks included, becomes part of the printed text. */
fprintf( stderr, "\nusage: %s [ /L ] file_name[s]\n\n\
Analyze the bytes (characters) used within any file, report\n\
the frequency of each byte present. If the location flag /L\n\
is set, include offsets of the first 8 occurrences of each\n",
Cmdname_() );
/* Continues the sentence started above, then the input/output summary. */
fprintf( stderr, " byte pattern present.\n\n\
input: Any file[s] whatsoever.\n\n\
output: file_name.BYT which contains up to 256 lines, one line for\n\
each different byte present. The byte is shown first in\n\
printable OR octal form, then the hexadecimal equivalent.\n" );
/* Remaining report columns and the effect of the /L option. */
fprintf( stderr,
" The third column is frequency. The fourth column shows\n\
percentage of total occurrences within the file.\n\n\
If the /L locations option is selected, the output file is\n\
name file_name.LOC and the offsets of the first up to 8\n\
occurrences follow at the end of each line.\n\n" ) ;
fprintf( stderr, "writeup: MIR TUTORIAL ONE, topic 5\n\n" );
/* Showing usage always counts as an unsuccessful run. */
exit( 1 ) ;
}
/*
 * PROCESS - Read the file open on fd until end of input, counting how
 *           often each byte value occurs.
 *
 *   fd         descriptor of the input file, opened by the caller
 *   cum        cum[b] receives the number of occurrences of byte b;
 *              all 256 entries are zeroed here first
 *   locn       when locations is TRUE, locn[b][k] receives the offset
 *              (counted from the first byte read) of occurrence k+1
 *              of byte b, for the first 8 occurrences only; other
 *              entries are not written
 *   locations  TRUE if offsets are to be recorded
 *
 *   A failing read() ends the scan with a message on stderr; the
 *   counts gathered up to that point are kept.
 */
void
process( fd, cum, locn, locations )
    int         fd;
    long int    cum[ 256 ],
                locn[ 256 ][ 8 ];
    Bool        locations ;
{
    unsigned char   buf_in[ INTAKE ];
    unsigned char   byte ;
    int             buf_len, i ;
    long int        offset ;        /* bytes consumed before this buffer */

    for( i = 0 ; i < 256 ; i++ )
        cum[ i ] = 0 ;

    offset = 0 ;
    repeat
    {
        buf_len = read( fd, buf_in, INTAKE ) ;
        if( buf_len < 0 )
        {
            /* read() reports failure with -1.  Treating that as data
               would walk the offset backwards and never terminate. */
            fprintf( stderr,
                "Read error after %ld bytes; results are incomplete\n",
                offset );
            break ;
        }
        if( buf_len == 0 )
            break ;                 /* end of input */

        for( i = 0 ; i < buf_len ; i++ )
        {
            byte = buf_in[ i ] ;
            cum[ byte ]++ ;
            /* only the first 8 positions of each byte value are kept */
            if( locations && cum[ byte ] <= 8 )
                locn[ byte ][ cum[ byte ] - 1 ] = offset + i ;
        }
        offset += buf_len ;
    }
    return ;
}
/*
 * REPORT - Output the data for analysis of one file
 *
 *   cum        occurrence count for each of the 256 byte values
 *   locn       offsets of the first (up to 8) occurrences of each byte;
 *              read only when locations is TRUE
 *   locations  TRUE if the user asked for offsets; selects the ".loc"
 *              report instead of the ".byt" report
 *   name_in    name of the file that was analyzed
 *
 *   The report name is name_in with its extension replaced by "byt"
 *   or "loc" (appended if name_in has none).  The extension is looked
 *   for only after the last '\', '/' or ':' so that a dot in a
 *   directory name is not mistaken for it.  non_exist() then alters
 *   the name if a file of that name is already present.
 *
 *   One line is written per byte value that occurs: the byte (as
 *   itself, or as a backslash and three octal digits), its hex value,
 *   its count and its percentage of all bytes, followed by the
 *   recorded offsets when locations is TRUE.  Outcome messages go to
 *   stderr; nothing is returned.
 */
void
report( cum, locn, locations, name_in )
    long int    cum[ 256 ],
                locn[ 256 ][ 8 ];
    Bool        locations ;
    char        name_in[] ;
{
    FILE        *fp_out ;
    char        fname[ FILENAME_MAX ] ;
    char        *ext,               /* replacement extension */
                *base,              /* start of the name proper */
                *dot, *p ;
    double      pct,                /* % of occurrences */
                f_grand ;           /* grand total bytes */
    Bool        foul_up ;
    int         i, j, written ;
    long int    grand_total,
                limit ;             /* up to 8 are tracked */

    ext = locations ? "loc" : "byt" ;

    /* Worst case the result is name_in + '.' + 3 letters + NUL. */
    if( strlen( name_in ) + 5 > sizeof( fname ) )
    {
        fprintf( stderr, "File name too long: %s\n", name_in );
        return ;
    }
    strcpy( fname, name_in );

    base = fname ;
    for( p = fname ; *p ; p++ )
    {
        if( *p == '\\' || *p == '/' || *p == ':' )
            base = p + 1 ;
    }
    /* stop at the first dot of the name, or at its end if there is none */
    for( dot = base ; *dot && *dot != '.' ; dot++ )
        ;
    *dot = '.' ;
    strcpy( dot + 1, ext );

    non_exist( fname ) ;
    if(( fp_out = fopen( fname, "w" )) == NULL )
    {
        fprintf( stderr, "Can't open file %s\n", fname );
        return ;
    }

    grand_total = 0 ;
    for( i = 0 ; i < 256 ; i++ )
        grand_total += cum[ i ] ;
    f_grand = ( double ) grand_total ;

    foul_up = FALSE ;
    for( i = 0 ; i < 256 ; i++ )
    {
        if( !cum[ i ] )
            continue ;              /* byte value never seen */

        /* f_grand cannot be zero here because cum[ i ] is non-zero */
        pct = 100.0 * ( ( double ) cum[ i ] / f_grand ) ;

        /* For Unix version, next line should read
                if( i < 0x21 || i > 0x7e )      */
        if( i < 0x21 || i == 0x7f || i == 0xff )
            written = fprintf( fp_out, "\\%03o [%02X]%7ld %4.1f%%",
                        ( unsigned ) i, ( unsigned ) i, cum[ i ], pct ) ;
        else
            written = fprintf( fp_out, "%c [%02X]%7ld %4.1f%%",
                        i, ( unsigned ) i, cum[ i ], pct ) ;
        /* fprintf signals failure with a negative value, never with 0 */
        if( written < 0 )
            foul_up = TRUE ;

        if( locations )
        {
            fputc( ' ', fp_out );
            fputc( ' ', fp_out );
            limit = ( cum[ i ] < 8 ) ? cum[ i ] : 8 ;
            for( j = 0 ; j < limit ; j++ )
            {
                if( fprintf( fp_out, " %ld", locn[ i ][ j ] ) < 0 )
                    foul_up = TRUE ;
            }
        }
        if( fputc( '\n', fp_out ) == EOF )
            foul_up = TRUE ;
    }

    /* Close before announcing success: buffered output may only fail
       when it is flushed. */
    if( fclose( fp_out ))
    {
        fprintf( stderr, "Problem closing %s\n", fname );
        foul_up = TRUE ;
    }
    if( foul_up )
        fprintf( stderr, "Unable to write report in file %s\n", fname );
    else
        fprintf( stderr,
            "\n\nInput size = %ld bytes. Results are in file %s\n\n",
            grand_total, fname );
    return ;
}
/*
 * NON_EXIST    Test the existence of a file; if it exists,
 *              substitute digits successively for the last
 *              byte in the name until a non-existent file
 *              is named, or until the last digit is '9'; do
 *              the same with the second last byte, for 100
 *              possible combinations
 *
 *   fname      name to test; rewritten in place when a file of that
 *              name exists.  If all candidates exist as well, the
 *              last one tried is left in fname.
 *
 *   The second last byte is replaced only when it belongs to the
 *   name: the search stops after the first ten candidates when that
 *   byte is missing (one-character name) or is '.', '\', '/' or ':'.
 */
void
non_exist( fname )
    char    fname[] ;
{
    struct stat info ;
    int     len,
            tens,                   /* batch of 10 names */
            units ;
    char    before ;                /* byte ahead of the last one */

    if( stat( fname, &info ) != 0 )
        return ;                    /* no such file: name is usable */

    len = ( int ) strlen( fname ) ;
    if( len < 1 )
        return ;

    for( tens = 0 ; tens < 10 ; tens++ )
    {
        if( tens > 0 )
        {
            if( len < 2 )
                return ;            /* nothing ahead of the last byte */
            before = fname[ len - 2 ] ;
            if( before == '.' || before == '\\'
                              || before == '/' || before == ':' )
                return ;            /* Don't mess with one digit
                                       name extension or the path */
            fname[ len - 2 ] = ( char )( '0' + tens ) ;
        }
        /* Try names ending in 0 through 9 */
        for( units = 0 ; units < 10 ; units++ )
        {
            fname[ len - 1 ] = ( char )( '0' + units ) ;
            if( stat( fname, &info ) != 0 )
                return ;            /* found an unused name */
        }
    }
    return ;
}